Make some paper figures¶
The next cell is tagged parameters for papermill parameterization:
In [1]:
# tagged parameters for `papermill`
In [2]:
# Parameters
params = {
"func_scores_293T-Mxra8-E3E2-B-241028-func-1": "results/func_scores/293T-Mxra8-E3E2-B-241028-func-1_func_scores.csv",
"func_scores_293T-Mxra8-E3E2-B-241028-func-2": "results/func_scores/293T-Mxra8-E3E2-B-241028-func-2_func_scores.csv",
"func_scores_293T-TIM1-E3E2-B-241028-func-1": "results/func_scores/293T-TIM1-E3E2-B-241028-func-1_func_scores.csv",
"func_scores_293T-TIM1-E3E2-B-241028-func-2": "results/func_scores/293T-TIM1-E3E2-B-241028-func-2_func_scores.csv",
"func_scores_293T-Mxra8-6KE1-B-241028-func-1": "results/func_scores/293T-Mxra8-6KE1-B-241028-func-1_func_scores.csv",
"func_scores_293T-Mxra8-6KE1-B-241028-func-2": "results/func_scores/293T-Mxra8-6KE1-B-241028-func-2_func_scores.csv",
"func_scores_293T-TIM1-6KE1-B-241028-func-1": "results/func_scores/293T-TIM1-6KE1-B-241028-func-1_func_scores.csv",
"func_scores_293T-TIM1-6KE1-B-241028-func-2": "results/func_scores/293T-TIM1-6KE1-B-241028-func-2_func_scores.csv",
"func_scores_293T-Mxra8-E3E2-A-241113-func-1": "results/func_scores/293T-Mxra8-E3E2-A-241113-func-1_func_scores.csv",
"func_scores_293T-Mxra8-E3E2-A-241113-func-2": "results/func_scores/293T-Mxra8-E3E2-A-241113-func-2_func_scores.csv",
"func_scores_293T-TIM1-E3E2-A-241113-func-1": "results/func_scores/293T-TIM1-E3E2-A-241113-func-1_func_scores.csv",
"func_scores_293T-TIM1-E3E2-A-241113-func-2": "results/func_scores/293T-TIM1-E3E2-A-241113-func-2_func_scores.csv",
"func_scores_293T-Mxra8-6KE1-A-241113-func-1": "results/func_scores/293T-Mxra8-6KE1-A-241113-func-1_func_scores.csv",
"func_scores_293T-Mxra8-6KE1-A-241113-func-2": "results/func_scores/293T-Mxra8-6KE1-A-241113-func-2_func_scores.csv",
"func_scores_293T-TIM1-6KE1-A-241113-func-1": "results/func_scores/293T-TIM1-6KE1-A-241113-func-1_func_scores.csv",
"func_scores_293T-TIM1-6KE1-A-241113-func-2": "results/func_scores/293T-TIM1-6KE1-A-241113-func-2_func_scores.csv",
"func_scores_C636-E3E2-B-241122-func-1": "results/func_scores/C636-E3E2-B-241122-func-1_func_scores.csv",
"func_scores_C636-E3E2-B-241122-func-2": "results/func_scores/C636-E3E2-B-241122-func-2_func_scores.csv",
"func_scores_C636-6KE1-B-241122-func-1": "results/func_scores/C636-6KE1-B-241122-func-1_func_scores.csv",
"func_scores_C636-6KE1-B-241122-func-2": "results/func_scores/C636-6KE1-B-241122-func-2_func_scores.csv",
"func_scores_C636-E3E2-A-241212-func-1": "results/func_scores/C636-E3E2-A-241212-func-1_func_scores.csv",
"func_scores_C636-E3E2-A-241212-func-2": "results/func_scores/C636-E3E2-A-241212-func-2_func_scores.csv",
"func_scores_C636-6KE1-A-241212-func-1": "results/func_scores/C636-6KE1-A-241212-func-1_func_scores.csv",
"func_scores_C636-6KE1-A-241212-func-2": "results/func_scores/C636-6KE1-A-241212-func-2_func_scores.csv",
"func_effects_293T-Mxra8_entry_293T-Mxra8-E3E2-A-241113-func-1": "results/func_effects/by_selection/293T-Mxra8-E3E2-A-241113-func-1_func_effects.csv",
"func_effects_293T-Mxra8_entry_293T-Mxra8-E3E2-A-241113-func-2": "results/func_effects/by_selection/293T-Mxra8-E3E2-A-241113-func-2_func_effects.csv",
"func_effects_293T-Mxra8_entry_293T-Mxra8-E3E2-B-241028-func-1": "results/func_effects/by_selection/293T-Mxra8-E3E2-B-241028-func-1_func_effects.csv",
"func_effects_293T-Mxra8_entry_293T-Mxra8-E3E2-B-241028-func-2": "results/func_effects/by_selection/293T-Mxra8-E3E2-B-241028-func-2_func_effects.csv",
"func_effects_293T-Mxra8_entry_293T-Mxra8-6KE1-A-241113-func-1": "results/func_effects/by_selection/293T-Mxra8-6KE1-A-241113-func-1_func_effects.csv",
"func_effects_293T-Mxra8_entry_293T-Mxra8-6KE1-A-241113-func-2": "results/func_effects/by_selection/293T-Mxra8-6KE1-A-241113-func-2_func_effects.csv",
"func_effects_293T-Mxra8_entry_293T-Mxra8-6KE1-B-241028-func-1": "results/func_effects/by_selection/293T-Mxra8-6KE1-B-241028-func-1_func_effects.csv",
"func_effects_293T-Mxra8_entry_293T-Mxra8-6KE1-B-241028-func-2": "results/func_effects/by_selection/293T-Mxra8-6KE1-B-241028-func-2_func_effects.csv",
"func_effects_293T-TIM1_entry_293T-TIM1-E3E2-A-241113-func-1": "results/func_effects/by_selection/293T-TIM1-E3E2-A-241113-func-1_func_effects.csv",
"func_effects_293T-TIM1_entry_293T-TIM1-E3E2-A-241113-func-2": "results/func_effects/by_selection/293T-TIM1-E3E2-A-241113-func-2_func_effects.csv",
"func_effects_293T-TIM1_entry_293T-TIM1-E3E2-B-241028-func-1": "results/func_effects/by_selection/293T-TIM1-E3E2-B-241028-func-1_func_effects.csv",
"func_effects_293T-TIM1_entry_293T-TIM1-E3E2-B-241028-func-2": "results/func_effects/by_selection/293T-TIM1-E3E2-B-241028-func-2_func_effects.csv",
"func_effects_293T-TIM1_entry_293T-TIM1-6KE1-A-241113-func-1": "results/func_effects/by_selection/293T-TIM1-6KE1-A-241113-func-1_func_effects.csv",
"func_effects_293T-TIM1_entry_293T-TIM1-6KE1-A-241113-func-2": "results/func_effects/by_selection/293T-TIM1-6KE1-A-241113-func-2_func_effects.csv",
"func_effects_293T-TIM1_entry_293T-TIM1-6KE1-B-241028-func-1": "results/func_effects/by_selection/293T-TIM1-6KE1-B-241028-func-1_func_effects.csv",
"func_effects_293T-TIM1_entry_293T-TIM1-6KE1-B-241028-func-2": "results/func_effects/by_selection/293T-TIM1-6KE1-B-241028-func-2_func_effects.csv",
"func_effects_C636_entry_C636-E3E2-B-241122-func-1": "results/func_effects/by_selection/C636-E3E2-B-241122-func-1_func_effects.csv",
"func_effects_C636_entry_C636-E3E2-B-241122-func-2": "results/func_effects/by_selection/C636-E3E2-B-241122-func-2_func_effects.csv",
"func_effects_C636_entry_C636-6KE1-B-241122-func-1": "results/func_effects/by_selection/C636-6KE1-B-241122-func-1_func_effects.csv",
"func_effects_C636_entry_C636-6KE1-B-241122-func-2": "results/func_effects/by_selection/C636-6KE1-B-241122-func-2_func_effects.csv",
"func_effects_C636_entry_C636-E3E2-A-241212-func-1": "results/func_effects/by_selection/C636-E3E2-A-241212-func-1_func_effects.csv",
"func_effects_C636_entry_C636-E3E2-A-241212-func-2": "results/func_effects/by_selection/C636-E3E2-A-241212-func-2_func_effects.csv",
"func_effects_C636_entry_C636-6KE1-A-241212-func-1": "results/func_effects/by_selection/C636-6KE1-A-241212-func-1_func_effects.csv",
"func_effects_C636_entry_C636-6KE1-A-241212-func-2": "results/func_effects/by_selection/C636-6KE1-A-241212-func-2_func_effects.csv",
"codon_variants": "results/variants/codon_variants.csv",
"annotated_mut_summary": "results/annotated_summary_csvs/entry_293T-Mxra8_C636_293T-TIM1_Mxra8-binding_annotated.csv",
"annotated_site_summary": "results/annotated_summary_csvs/entry_293T-Mxra8_C636_293T-TIM1_Mxra8-binding_annotated_site_means.csv",
"nb": "notebooks/paper_figures.ipynb",
}
min_times_seen = 2
Python imports:
In [3]:
import itertools
import os
import altair as alt
import dms_variants.codonvarianttable
import numpy
import pandas as pd
import scipy.stats
_ = alt.data_transformers.disable_max_rows()
Distribution of variant functional scores¶
We want the distribution of variant functional scores, similar to as made by this notebook but including both E3-E2 and 6K-E1 fragments and not including deletions (since those are rare in our libraries).
First read all the functional scores, ignoring those for deletions since those are rare and not reported in paper:
In [4]:
def classify_selection(sel):
sels = {"293T-Mxra8": "293T-Mxra8", "293T-TIM1":"293T-TIM1", "C636":"C6/36"}
assert sum(s in sel for s in sels) == 1, sel
for s in sels:
if s in sel:
label = [sels[s]]
libs = {"-A-": "library A", "-B-": "library B"}
assert sum(l in sel for l in libs) == 1, sel
for l in libs:
if l in sel:
label.append(libs[l])
return " ".join(label)
func_scores_df = (
pd.concat(
[
pd.read_csv(f).assign(selection=sel)
for (sel, f) in params.items() if sel.startswith("func_scores_")
]
)
.assign(selection=lambda x: x["selection"].map(classify_selection))
.pipe(dms_variants.codonvarianttable.CodonVariantTable.classifyVariants)
.query("variant_class != 'deletion'")
)
(
func_scores_df
.groupby(["selection", "variant_class"])
.aggregate(n_variants=pd.NamedAgg("barcode", "count"))
)
Out[4]:
| n_variants | ||
|---|---|---|
| selection | variant_class | |
| 293T-Mxra8 library A | 1 nonsynonymous | 142665 |
| >1 nonsynonymous | 79744 | |
| stop | 5778 | |
| synonymous | 4245 | |
| wildtype | 29406 | |
| 293T-Mxra8 library B | 1 nonsynonymous | 132342 |
| >1 nonsynonymous | 73868 | |
| stop | 5855 | |
| synonymous | 4048 | |
| wildtype | 27337 | |
| 293T-TIM1 library A | 1 nonsynonymous | 142805 |
| >1 nonsynonymous | 79859 | |
| stop | 5789 | |
| synonymous | 4237 | |
| wildtype | 29411 | |
| 293T-TIM1 library B | 1 nonsynonymous | 132443 |
| >1 nonsynonymous | 73929 | |
| stop | 5863 | |
| synonymous | 4048 | |
| wildtype | 27349 | |
| C6/36 library A | 1 nonsynonymous | 140546 |
| >1 nonsynonymous | 78516 | |
| stop | 5670 | |
| synonymous | 4153 | |
| wildtype | 28913 | |
| C6/36 library B | 1 nonsynonymous | 131133 |
| >1 nonsynonymous | 73139 | |
| stop | 5797 | |
| synonymous | 3999 | |
| wildtype | 27056 |
Make the plot:
In [5]:
def ridgeplot(df):
variant_classes = list(
reversed(
[
c
for c in [
"wildtype",
"synonymous",
"1 nonsynonymous",
">1 nonsynonymous",
"deletion",
"stop",
]
if c in set(df["variant_class"])
]
)
)
assert set(df["variant_class"]) == set(variant_classes)
# get smoothed distribution of functional scores
bins = numpy.linspace(
df["func_score"].min(),
df["func_score"].max(),
num=50,
)
smoothed_dist = pd.concat(
[
pd.DataFrame(
{
"selection": sel,
"variant_class": var,
"func_score": bins,
"count": scipy.stats.gaussian_kde(df["func_score"])(bins),
"mean_func_score": df["func_score"].mean(),
"number of variants": len(df),
}
)
for (sel, var), df in df.groupby(["selection", "variant_class"])
]
)
# assign y / y2 for plotting
facet_overlap = 0.7 # maximal facet overlap
max_count = (smoothed_dist["count"]).max()
smoothed_dist = smoothed_dist.assign(
y=lambda x: x["variant_class"].map(lambda v: variant_classes.index(v)),
y2=lambda x: x["y"] + x["count"] / max_count / facet_overlap,
)
# ridgeline plot, based on this but using y / y2 rather than row:
# https://altair-viz.github.io/gallery/ridgeline_plot.html
ridgeline_chart = (
alt.Chart(smoothed_dist)
.encode(
x=alt.X(
"func_score", title="functional score for cell entry", scale=alt.Scale(nice=False)
),
y=alt.Y(
"y",
scale=alt.Scale(nice=False),
title=None,
axis=alt.Axis(
ticks=False,
domain=False,
# set manual labels https://stackoverflow.com/a/64106056
values=[v + 0.5 for v in range(len(variant_classes))],
labelExpr=f"{str(variant_classes)}[round(datum.value - 0.5)]",
),
),
y2=alt.Y2("y2"),
fill=alt.Fill(
"mean_func_score:Q",
title="mean functional score",
legend=alt.Legend(direction="horizontal"),
scale=alt.Scale(scheme="yellowgreenblue"),
),
facet=alt.Facet(
"selection",
columns=2,
title=None,
header=alt.Header(
labelFontWeight="bold",
labelPadding=0,
),
),
tooltip=[
"selection",
"variant_class",
alt.Tooltip(
"mean_func_score", format=".2f", title="mean functional score"
),
],
)
.mark_area(
interpolate="monotone",
smooth=True,
fillOpacity=0.8,
stroke="lightgray",
strokeWidth=0.5,
)
.configure_view(stroke=None)
.configure_axis(grid=False)
.properties(width=180, height=22 * len(variant_classes))
)
ridgeline_chart = ridgeline_chart.properties(
autosize=alt.AutoSizeParams(resize=True),
)
return ridgeline_chart
func_scores_chart = ridgeplot(func_scores_df)
func_scores_chart
Out[5]:
In [6]:
codon_variants = pd.read_csv(params["codon_variants"])
lib_rename = {"E_A": "library A", "E_B": "library B"}
display(
codon_variants
.groupby("library")
.aggregate(n_variants=pd.NamedAgg("barcode", "count"))
)
max_muts = 4
print(f"Only keeping {lib_rename=}, and clipping at {max_muts=}")
nmuts_dist = (
codon_variants
.query("library in @lib_rename")
.assign(
library=lambda x: x["library"].map(lib_rename),
n_muts=lambda x: x["n_aa_substitutions"].clip(upper=max_muts),
)
.groupby(["library", "n_muts"], as_index=False)
.aggregate(n_variants=pd.NamedAgg("barcode", "count"))
.assign(
n_muts_label=lambda x: x["n_muts"].map(
lambda n: str(n) if n < max_muts else f">{n - 1}"
)
)
)
nmuts_dist_chart = (
alt.Chart(nmuts_dist)
.encode(
alt.X(
"n_muts_label",
sort=alt.SortField("n_muts"),
title="number amino-acid mutations",
axis=alt.Axis(labelAngle=0),
),
alt.Y("n_variants", title="number of barcoded variants"),
alt.Column(
"library",
title=None,
header=alt.Header(labelFontStyle="bold", labelFontSize=11, labelPadding=0),
),
)
.mark_bar(color="black")
.configure_axis(grid=False)
.properties(height=150, width=150)
)
nmuts_dist_chart
| n_variants | |
|---|---|
| library | |
| 6KE1_A | 69359 |
| 6KE1_B | 62446 |
| E3E2_A | 65426 |
| E3E2_B | 62495 |
| E_A | 134757 |
| E_B | 124915 |
Only keeping lib_rename={'E_A': 'library A', 'E_B': 'library B'}, and clipping at max_muts=4
Out[6]:
In [7]:
func_effects_by_lib = (
pd.concat(
[
pd.read_csv(f).assign(
name=name,
cell=name.split("_")[2],
replicate=name.split("-")[-1],
library="library A" if "-A-" in name else "library B",
region="E3E2" if "E3E2" in name else "6KE1",
)
for (name, f) in params.items() if name.startswith("func_effects_")
],
ignore_index=True,
)
.query("wildtype != mutant")
.query("times_seen >= @min_times_seen")
.assign(
mut_in_region=lambda x: x.apply(
lambda r: (
("6K" in r["site"] or "E1" in r["site"]) and (r["region"] == "6KE1")
or ("E2" in r["site"] or "E2" in r["site"]) and (r["region"] == "E3E2")
),
axis=1,
),
)
.query("mut_in_region")
.assign(
library_replicate=lambda x: x["library"] + ", replicate " + x["replicate"],
cell=lambda x: x["cell"].map(
{
"C636": "entry in C6/36 cells",
"293T-Mxra8": "entry in 293T-Mxra8 cells",
"293T-TIM1": "entry in 293T-TIM1 cells",
}
),
)
[["site", "wildtype", "mutant", "functional_effect", "library_replicate", "cell"]]
)
func_effects_by_lib
Out[7]:
| site | wildtype | mutant | functional_effect | library_replicate | cell | |
|---|---|---|---|---|---|---|
| 1346 | 1(E2) | S | - | -0.98530 | library A, replicate 1 | entry in 293T-Mxra8 cells |
| 1347 | 1(E2) | S | A | 0.24300 | library A, replicate 1 | entry in 293T-Mxra8 cells |
| 1348 | 1(E2) | S | C | -1.91700 | library A, replicate 1 | entry in 293T-Mxra8 cells |
| 1349 | 1(E2) | S | D | -0.17820 | library A, replicate 1 | entry in 293T-Mxra8 cells |
| 1350 | 1(E2) | S | E | -0.71480 | library A, replicate 1 | entry in 293T-Mxra8 cells |
| ... | ... | ... | ... | ... | ... | ... |
| 306391 | 439(E1) | H | Y | -0.37860 | library A, replicate 2 | entry in C6/36 cells |
| 306393 | 440(E1) | * | K | 0.06401 | library A, replicate 2 | entry in C6/36 cells |
| 306394 | 440(E1) | * | Q | -2.44000 | library A, replicate 2 | entry in C6/36 cells |
| 306395 | 440(E1) | * | W | -0.49750 | library A, replicate 2 | entry in C6/36 cells |
| 306396 | 440(E1) | * | Y | -0.23450 | library A, replicate 2 | entry in C6/36 cells |
210846 rows × 6 columns
Now make the plot:
In [8]:
for cell, cell_df in func_effects_by_lib.groupby("cell"):
corr_panels = []
for sel1, sel2 in itertools.combinations(sorted(cell_df["library_replicate"].unique()), 2):
corr_df = (
cell_df.query("library_replicate == @sel1")[["functional_effect", "site", "mutant"]]
.rename(columns={"functional_effect": sel1})
.merge(
cell_df.query("library_replicate == @sel2")[["functional_effect", "site", "mutant"]].rename(
columns={"functional_effect": sel2}
),
validate="one_to_one",
)
.drop(columns=["site", "mutant"])
)
n = len(corr_df)
r = corr_df[[sel1, sel2]].corr().values[1, 0]
corr_panels.append(
alt.Chart(corr_df)
.encode(
alt.X(sel1, scale=alt.Scale(nice=False, padding=4)),
alt.Y(sel2, scale=alt.Scale(nice=False, padding=4)),
)
.mark_circle(color="black", size=25, opacity=0.15)
.properties(
width=135,
height=135,
title=alt.TitleParams(
f"R = {r:.2f}, N = {n}", fontSize=11, fontWeight="normal", dy=2
),
)
)
corr_chart = alt.hconcat(*corr_panels, spacing=5).configure_axis(grid=False).properties(
title=alt.TitleParams(f"correlations among libraries and replicates for mutations effects on {cell}", anchor="middle")
)
display(corr_chart)